# import libraries
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Bidirectional
# Load the Huawei KPI dataset and parse the timestamp column to tz-aware datetimes
huwaei=pd.read_csv('training_1000.csv')
huwaei['timestamp']=pd.to_datetime(huwaei['timestamp'])
# Columns (per the output below): timestamp, kpi_value, request_count, anomaly_label
huwaei.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20159 entries, 0 to 20158 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 20159 non-null datetime64[ns, pytz.FixedOffset(120)] 1 kpi_value 20159 non-null float64 2 request_count 20159 non-null int64 3 anomaly_label 20159 non-null int64 dtypes: datetime64[ns, pytz.FixedOffset(120)](1), float64(1), int64(2) memory usage: 630.1 KB
# Preview the first rows
huwaei.head()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 0 | 2020-08-14 02:00:00+02:00 | 0.998755 | 24908 | 0 |
| 1 | 2020-08-14 02:01:00+02:00 | 0.997683 | 25029 | 0 |
| 2 | 2020-08-14 02:02:00+02:00 | 0.998341 | 24115 | 0 |
| 3 | 2020-08-14 02:03:00+02:00 | 0.998211 | 24031 | 0 |
| 4 | 2020-08-14 02:04:00+02:00 | 0.998403 | 23790 | 0 |
# Preview the last rows
huwaei.tail()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 |
# Drop rows with any missing value. The original call used inplace=False and
# discarded the returned frame, so it was a no-op; assign the result back.
# (The isnull() checks below show this dataset has no NaNs, so the cleaned
# frame is identical here, but the statement now actually does what it says.)
huwaei = huwaei.dropna(how='any')
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 0 | 2020-08-14 02:00:00+02:00 | 0.998755 | 24908 | 0 |
| 1 | 2020-08-14 02:01:00+02:00 | 0.997683 | 25029 | 0 |
| 2 | 2020-08-14 02:02:00+02:00 | 0.998341 | 24115 | 0 |
| 3 | 2020-08-14 02:03:00+02:00 | 0.998211 | 24031 | 0 |
| 4 | 2020-08-14 02:04:00+02:00 | 0.998403 | 23790 | 0 |
| ... | ... | ... | ... | ... |
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 |
20159 rows × 4 columns
# Boolean mask: True for rows that duplicate an earlier row
huwaei.duplicated()
0 False
1 False
2 False
3 False
4 False
...
20154 False
20155 False
20156 False
20157 False
20158 False
Length: 20159, dtype: bool
# Count duplicated rows (0 in this dataset)
huwaei.duplicated().sum()
0
# Confirm column dtypes after the timestamp conversion
huwaei.dtypes
timestamp datetime64[ns, pytz.FixedOffset(120)] kpi_value float64 request_count int64 anomaly_label int64 dtype: object
# Per-column missing-value counts (all zero here)
huwaei.isnull().sum()
timestamp 0 kpi_value 0 request_count 0 anomaly_label 0 dtype: int64
# Element-wise missing-value mask
huwaei.isnull()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 0 | False | False | False | False |
| 1 | False | False | False | False |
| 2 | False | False | False | False |
| 3 | False | False | False | False |
| 4 | False | False | False | False |
| ... | ... | ... | ... | ... |
| 20154 | False | False | False | False |
| 20155 | False | False | False | False |
| 20156 | False | False | False | False |
| 20157 | False | False | False | False |
| 20158 | False | False | False | False |
20159 rows × 4 columns
# Plot the KPI series and the anomaly labels together on twin y-axes.
kpi = huwaei['kpi_value']
timestamp = huwaei['timestamp']
label = huwaei['anomaly_label']
fig, ax_val = plt.subplots(figsize=(40, 10))
ax_lbl = ax_val.twinx()
ax_val.plot(timestamp, kpi, color='blue')
ax_lbl.plot(timestamp, label, color='red', linewidth=1)
ax_val.set_xlabel('Time Stamp', fontsize=20)
ax_val.set_ylabel('KPI Value', fontsize=20)
ax_lbl.set_ylabel('Anomaly Label', fontsize=20)
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
Text(0.5, 1.0, 'Huwaei Dataset')
# Display the timestamp series
timestamp
# Locate the positional index of the chosen last split boundary
# (string comparison works because pandas coerces it to a tz-aware timestamp)
arr=np.where(timestamp=='2020-08-24 00:00:00+02:00')
arr
(array([14280], dtype=int64),)
# Hand-picked row indices marking the boundaries between train and test periods;
# materialize each as the timestamp at that row.
_split_rows = (4200, 4800, 7080, 7680, 11400, 14280)
split1, split2, split3, split4, split5, split6 = (timestamp[i] for i in _split_rows)
# Re-plot the series with the six split boundaries marked as dashed lines.
fig, ax_val = plt.subplots(figsize=(40, 10))
ax_lbl = ax_val.twinx()
ax_val.plot(timestamp, kpi, color='blue')
ax_lbl.plot(timestamp, label, color='red', linewidth=2)
ax_val.set_xlabel('Time Stamp', fontsize=20)
ax_val.set_ylabel('KPI Value', fontsize=20)
ax_lbl.set_ylabel('Anomaly Label', fontsize=20)
# One dashed vertical line per split boundary
for boundary in (split1, split2, split3, split4, split5, split6):
    ax_val.axvline(boundary, color='green', linestyle='dashed', linewidth=2)
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
print("Start date is: ", timestamp.min())
print("End date is: ", timestamp.max())
print(f"Length of full data: {len(huwaei)}")
Start date is: 2020-08-14 02:00:00+02:00 End date is: 2020-08-28 02:00:00+02:00 Length of full data: 20159
# Collect the six boundary timestamps for use when slicing the intervals below
splits=[split1, split2, split3, split4, split5, split6]
splits
[Timestamp('2020-08-17 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-17 10:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-19 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-19 10:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-22 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-24 00:00:00+0200', tz='pytz.FixedOffset(120)')]
def _between(lo, hi):
    """Rows of huwaei with lo < timestamp <= hi."""
    return huwaei.loc[(timestamp > lo) & (timestamp <= hi)]

# Training intervals: everything outside the three held-out test windows
train1 = huwaei.loc[timestamp <= splits[0]]
train2 = _between(splits[1], splits[2])
train3 = _between(splits[3], splits[4])
train4 = huwaei.loc[timestamp > splits[5]]
# Testing intervals: the three windows between consecutive boundaries
test1 = _between(splits[0], splits[1])
test2 = _between(splits[2], splits[3])
test3 = _between(splits[4], splits[5])
# Visualize the partition: training periods in blue, test periods in red,
# anomaly labels in green, split boundaries as dashed orange lines.
fig, ax_val = plt.subplots(figsize=(40, 10))
ax_lbl = ax_val.twinx()
for trn, tst in zip((train1, train2, train3), (test1, test2, test3)):
    ax_val.plot(trn['timestamp'], trn['kpi_value'], color='blue')
    ax_val.plot(tst['timestamp'], tst['kpi_value'], color='red')
ax_val.plot(train4['timestamp'], train4['kpi_value'], color='blue')
ax_lbl.plot(timestamp, label, color='green', linewidth=2)
ax_val.set_xlabel('Time Stamp', fontsize=20)
ax_val.set_ylabel('KPI Value', fontsize=20)
ax_lbl.set_ylabel('Anomaly Label', fontsize=20)
for boundary in (split1, split2, split3, split4, split5, split6):
    ax_val.axvline(boundary, color='orange', linestyle='dashed', linewidth=2)
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
Text(0.5, 1.0, 'Huwaei Dataset')
# Preprocessing: standardize kpi_value and request_count to zero mean / unit
# variance. The scaler is fitted on the training periods only and then applied
# to every frame, so no test statistics leak into training.
feature_cols = ['kpi_value', 'request_count']
scaler = StandardScaler()
scaler = scaler.fit(pd.concat([train1, train2, train3, train4])[feature_cols])
trainSeqs = [train1, train2, train3, train4]
testSeqs = [test1, test2, test3]
for frame in trainSeqs + testSeqs:
    frame[feature_cols] = scaler.transform(frame[feature_cols])
# Compute the train to test ratio
learn_ratio = pd.concat(trainSeqs).shape[0] / huwaei.shape[0]
print("Training dataset percentage: " "{:.2f}" "%".format(learn_ratio*100))
test_ratio = pd.concat(testSeqs).shape[0] / huwaei.shape[0]
print("Testing dataset percentage: " "{:.2f}" "%".format(test_ratio*100))
Training dataset percentage: 79.76% Testing dataset percentage: 20.24%
# Inspect the standardized training frames
trainSeqs
[ timestamp kpi_value request_count anomaly_label
0 2020-08-14 02:00:00+02:00 1.508552 1.765505 0
1 2020-08-14 02:01:00+02:00 1.243159 1.783991 0
2 2020-08-14 02:02:00+02:00 1.406095 1.644353 0
3 2020-08-14 02:03:00+02:00 1.373775 1.631520 0
4 2020-08-14 02:04:00+02:00 1.421287 1.594701 0
... ... ... ... ...
4196 2020-08-16 23:56:00+02:00 1.164309 1.031870 0
4197 2020-08-16 23:57:00+02:00 1.160132 1.069912 0
4198 2020-08-16 23:58:00+02:00 1.081624 0.943412 0
4199 2020-08-16 23:59:00+02:00 1.086746 0.860760 0
4200 2020-08-17 00:00:00+02:00 0.492590 9.208951 0
[4201 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
4801 2020-08-17 10:01:00+02:00 -0.314337 -0.372455 0
4802 2020-08-17 10:02:00+02:00 -0.505045 -0.493149 0
4803 2020-08-17 10:03:00+02:00 -1.001756 -0.537760 0
4804 2020-08-17 10:04:00+02:00 -0.656774 -0.542191 0
4805 2020-08-17 10:05:00+02:00 -0.512424 -0.562968 0
... ... ... ... ...
7076 2020-08-18 23:56:00+02:00 1.274315 1.097412 0
7077 2020-08-18 23:57:00+02:00 1.178616 1.100773 0
7078 2020-08-18 23:58:00+02:00 1.222305 1.077245 0
7079 2020-08-18 23:59:00+02:00 1.317042 0.987412 0
7080 2020-08-19 00:00:00+02:00 0.122372 9.494949 0
[2280 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
7681 2020-08-19 10:01:00+02:00 -0.771397 -0.360233 0
7682 2020-08-19 10:02:00+02:00 -0.809615 -0.384678 0
7683 2020-08-19 10:03:00+02:00 -0.715002 -0.382539 0
7684 2020-08-19 10:04:00+02:00 -0.774978 -0.450066 0
7685 2020-08-19 10:05:00+02:00 -0.977072 -0.497427 0
... ... ... ... ...
11396 2020-08-21 23:56:00+02:00 0.856348 0.400901 0
11397 2020-08-21 23:57:00+02:00 0.980600 0.401971 0
11398 2020-08-21 23:58:00+02:00 1.119249 0.345443 0
11399 2020-08-21 23:59:00+02:00 0.913249 0.303583 0
11400 2020-08-22 00:00:00+02:00 0.132003 7.406794 0
[3720 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
14281 2020-08-24 00:01:00+02:00 -0.397133 4.243700 0
14282 2020-08-24 00:02:00+02:00 -0.526726 2.170211 0
14283 2020-08-24 00:03:00+02:00 -0.462762 1.790866 0
14284 2020-08-24 00:04:00+02:00 -0.609524 1.652603 0
14285 2020-08-24 00:05:00+02:00 0.370959 1.594701 0
... ... ... ... ...
20154 2020-08-28 01:56:00+02:00 1.358435 2.003684 0
20155 2020-08-28 01:57:00+02:00 1.405715 2.009032 0
20156 2020-08-28 01:58:00+02:00 1.411597 2.067851 0
20157 2020-08-28 01:59:00+02:00 1.427468 2.041115 0
20158 2020-08-28 02:00:00+02:00 1.174925 2.496696 0
[5878 rows x 4 columns]]
# Inspect the standardized test frames
testSeqs
[ timestamp kpi_value request_count anomaly_label
4201 2020-08-17 00:01:00+02:00 -0.298217 4.215895 0
4202 2020-08-17 00:02:00+02:00 -0.174642 2.022476 0
4203 2020-08-17 00:03:00+02:00 0.387258 1.556812 0
4204 2020-08-17 00:04:00+02:00 0.446124 1.490659 0
4205 2020-08-17 00:05:00+02:00 0.582078 1.358966 0
... ... ... ... ...
4796 2020-08-17 09:56:00+02:00 -0.072083 -0.438761 0
4797 2020-08-17 09:57:00+02:00 -0.350213 -0.522177 0
4798 2020-08-17 09:58:00+02:00 -0.672569 -0.582065 0
4799 2020-08-17 09:59:00+02:00 -0.563406 -0.626371 0
4800 2020-08-17 10:00:00+02:00 -0.304437 -0.418136 0
[600 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
7081 2020-08-19 00:01:00+02:00 -0.759512 4.386852 0
7082 2020-08-19 00:02:00+02:00 0.168237 2.064948 0
7083 2020-08-19 00:03:00+02:00 0.329986 1.596228 0
7084 2020-08-19 00:04:00+02:00 0.338085 1.513882 0
7085 2020-08-19 00:05:00+02:00 0.695776 1.433979 0
... ... ... ... ...
7676 2020-08-19 09:56:00+02:00 -0.175670 -0.427150 0
7677 2020-08-19 09:57:00+02:00 -0.606715 -0.433261 0
7678 2020-08-19 09:58:00+02:00 -0.454601 -0.492080 0
7679 2020-08-19 09:59:00+02:00 -0.918436 -0.478177 0
7680 2020-08-19 10:00:00+02:00 -0.499805 -0.326470 0
[600 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
11401 2020-08-22 00:01:00+02:00 -1.145829 3.076637 0
11402 2020-08-22 00:02:00+02:00 -0.777070 1.166314 0
11403 2020-08-22 00:03:00+02:00 -0.110366 0.824094 0
11404 2020-08-22 00:04:00+02:00 -0.140076 0.722650 0
11405 2020-08-22 00:05:00+02:00 0.316230 0.681094 0
... ... ... ... ...
14276 2020-08-23 23:56:00+02:00 0.106058 1.186481 0
14277 2020-08-23 23:57:00+02:00 0.795930 1.293424 0
14278 2020-08-23 23:58:00+02:00 0.695386 1.230480 0
14279 2020-08-23 23:59:00+02:00 -0.039992 1.115898 0
14280 2020-08-24 00:00:00+02:00 0.334182 9.460269 0
[2880 rows x 4 columns]]
TIME_STEPS = 28  # window length fed to the LSTM autoencoder (could change; default was 32)

# Create the sequences of size TIME_STEPS to feed the sequential model
def to_sequences(x, y, TIME_STEPS=1):
    """Slice *x* and *y* into overlapping windows of length TIME_STEPS.

    Returns (X, Y) arrays of shape (len(x) - TIME_STEPS, TIME_STEPS, n_cols)
    where X[i] holds rows i..i+TIME_STEPS-1 of *x* and Y[i] holds rows
    i+1..i+TIME_STEPS of *y* — i.e. the same window shifted one step ahead,
    the next-step reconstruction target.
    """
    x_values = []
    y_values = []
    for i in range(len(x) - TIME_STEPS):
        x_values.append(x.iloc[i:(i + TIME_STEPS)].values)
        # .values for consistency with x (the numeric result is unchanged)
        y_values.append(y.iloc[i + 1:(i + TIME_STEPS + 1)].values)
    return np.array(x_values), np.array(y_values)
# Stack the windowed sequences from every period into single train/test tensors.
def _windows_for(seqs):
    """Build (X, Y) windows for each period in *seqs* and stack them vertically."""
    xs, ys = [], []
    for seq in seqs:
        x, y = to_sequences(seq[['kpi_value']], seq[['kpi_value']], TIME_STEPS)
        xs.append(x)
        ys.append(y)
    return np.vstack(xs), np.vstack(ys)

trainX, trainY = _windows_for(trainSeqs)
testX, testY = _windows_for(testSeqs)
# Report the resulting tensor shapes: (windows, TIME_STEPS, features)
print(f"Train X shape: {trainX.shape[0]} batches, {trainX.shape[1]} values for each batch, {trainX.shape[2]} dimensional batch")
print(f"Train Y shape: {trainY.shape[0]} batches, {trainY.shape[1]} values for each batch, {trainY.shape[2]} dimensional batch")
print(f"Test X shape: {testX.shape[0]} batches, {testX.shape[1]} values for each batch, {testX.shape[2]} dimensional batch")
print(f"Test Y shape: {testY.shape[0]} batches, {testY.shape[1]} values for each batch, {testY.shape[2]} dimensional batch")
# 15951 + 3984 + (32*7) = 20159
# 32 values for each sample
# 3984 testing batches
# NOTE(review): the three comment lines above describe an earlier run with
# TIME_STEPS=32; the printed output below shows the TIME_STEPS=28 run
# (15967 train / 3996 test windows).
Train X shape: 15967 batches, 28 values for each batch, 1 dimensional batch Train Y shape: 15967 batches, 28 values for each batch, 1 dimensional batch Test X shape: 3996 batches, 28 values for each batch, 1 dimensional batch Test Y shape: 3996 batches, 28 values for each batch, 1 dimensional batch
# Two consecutive input windows: trainX[1] is trainX[0] shifted by one step
print(trainX[0])
print("")
print(trainX[1])
[[1.50855227] [1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938]] [[1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521]]
# The target window trainY[0] is trainX[0] shifted one step ahead
print(trainX[0])
print("")
print(trainY[0])
[[1.50855227] [1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938]] [[1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521]]
# LSTM autoencoder: encode each 28-step window into a 128-d state, repeat it
# across the time axis, decode back to the window, and train on MAE
# reconstruction loss.
model = Sequential([
    layers.LSTM(128, input_shape=(trainX.shape[1], trainX.shape[2])),
    layers.Dropout(rate=0.2),
    layers.RepeatVector(trainX.shape[1]),
    layers.LSTM(128, return_sequences=True),
    layers.Dropout(rate=0.2),
    layers.TimeDistributed(layers.Dense(1)),
])
model.compile(optimizer='adam', loss='mae')
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 128) 66560
dropout (Dropout) (None, 128) 0
repeat_vector (RepeatVector (None, 28, 128) 0
)
lstm_1 (LSTM) (None, 28, 128) 131584
dropout_1 (Dropout) (None, 28, 128) 0
time_distributed (TimeDistr (None, 28, 1) 129
ibuted)
=================================================================
Total params: 198,273
Trainable params: 198,273
Non-trainable params: 0
_________________________________________________________________
# To reach a lower loss, increase the number of epochs.
# NOTE(review): with epochs=5 and patience=5 the EarlyStopping callback can
# never trigger; raise epochs (or lower patience) for it to have any effect.
# Fit model
history = model.fit(trainX, trainY, epochs=5, batch_size=32, validation_split=0.1, verbose=1, callbacks=[
keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
])
Epoch 1/5 450/450 [==============================] - 61s 135ms/step - loss: 0.2238 - val_loss: 0.1781 Epoch 2/5 450/450 [==============================] - 60s 133ms/step - loss: 0.2206 - val_loss: 0.1669 Epoch 3/5 450/450 [==============================] - 64s 143ms/step - loss: 0.2175 - val_loss: 0.1615 Epoch 4/5 450/450 [==============================] - 67s 148ms/step - loss: 0.2141 - val_loss: 0.1732 Epoch 5/5 450/450 [==============================] - 64s 143ms/step - loss: 0.2108 - val_loss: 0.1605
# Reconstruction loss (MAE) on the held-out test windows
model.evaluate(testX,testY)
125/125 [==============================] - 5s 44ms/step - loss: 0.1982
0.19821898639202118
# Plot the training and validation loss curves from the fit history
plt.figure(figsize=(40,10))
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
<matplotlib.legend.Legend at 0x1fc57c69970>
# Plot a histogram of the reconstruction error in the training dataset to decide a threshold
trainPredict = model.predict(trainX)
print(f"Train Predict Shape {trainPredict.shape}")
print(f"Train Y Shape {trainY.shape}")
# Mean absolute error over the time axis -> one error value per window
trainMAE = np.mean(np.abs(trainPredict - trainY), axis=1)
plt.figure(figsize=(40,10));
plt.hist(trainMAE, bins=30);
plt.legend(['kpi_value_prediction_error'])
# Fixed threshold chosen by eye from the histogram above
threshold_trainMAE = 0.30 #or Define 90% value of max as threshold.
Train Predict Shape (15967, 28, 1) Train Y Shape (15967, 28, 1)
# Histogram of the testing MAE (same per-window error as computed for training)
testPredict = model.predict(testX)
testMAE = np.mean(np.abs(testPredict - testY), axis=1)
plt.figure(figsize=(40,10));
plt.hist(testMAE, bins=30);
# Detect anomaly if the reconstruction loss for a sample is greater than the threshold.
# Skipping the first TIME_STEPS rows of each test period aligns the remaining
# rows 1:1 with the len(seq) - TIME_STEPS windows produced by to_sequences.
anomaly_df = pd.concat([seq[TIME_STEPS:] for seq in testSeqs])
anomaly_df['testMAE'] = testMAE
anomaly_df['threshold_trainMAE'] = threshold_trainMAE
anomaly_df['anomaly'] = anomaly_df['testMAE'] > anomaly_df['threshold_trainMAE']
# Overlay every reconstructed training window in a single figure
plt.figure(figsize=(40, 10))
for idx in range(trainPredict.shape[0]):
    plt.plot(trainPredict[idx])
# Same overlay for the test reconstructions
plt.figure(figsize=(40, 10))
for idx in range(testPredict.shape[0]):
    plt.plot(testPredict[idx])
# Number of test windows
testPredict.shape[0]
3996
# Plot the test MAE against the fixed training-derived threshold
plt.figure(figsize=[40, 10])
anomaly_df['threshold_trainMAE'].plot()
anomaly_df['testMAE'].plot()
plt.legend()
<matplotlib.legend.Legend at 0x1fc5bdb6f70>
# Detect the anomaly points inside the dataset (rows whose error exceeded the threshold)
anomalies = anomaly_df.loc[anomaly_df['anomaly'] == True]
# Plot the anomalies: KPI and ground-truth label curves with detected points as black dots
plt.figure(figsize=(40,10))
anomaly_df['kpi_value'].plot()
anomaly_df['anomaly_label'].plot()
anomalies['kpi_value'].plot(marker='.', linestyle='None', label='anomaly_detected', color='black')
plt.legend()
<matplotlib.legend.Legend at 0x1fc5a11a8e0>
# Inspect the resulting dataset (original columns plus testMAE / threshold / anomaly flag)
anomaly_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3996 entries, 4229 to 14280 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 3996 non-null datetime64[ns, pytz.FixedOffset(120)] 1 kpi_value 3996 non-null float64 2 request_count 3996 non-null float64 3 anomaly_label 3996 non-null int64 4 testMAE 3996 non-null float64 5 threshold_trainMAE 3996 non-null float64 6 anomaly 3996 non-null bool dtypes: bool(1), datetime64[ns, pytz.FixedOffset(120)](1), float64(4), int64(1) memory usage: 222.4 KB
# Display the full detection table
anomaly_df
| timestamp | kpi_value | request_count | anomaly_label | testMAE | threshold_trainMAE | anomaly | |
|---|---|---|---|---|---|---|---|
| 4229 | 2020-08-17 00:29:00+02:00 | 1.063615 | 1.123078 | 0 | 0.115438 | 0.3 | False |
| 4230 | 2020-08-17 00:30:00+02:00 | 0.934094 | 1.215661 | 0 | 0.109272 | 0.3 | False |
| 4231 | 2020-08-17 00:31:00+02:00 | 0.991987 | 1.260883 | 0 | 0.120696 | 0.3 | False |
| 4232 | 2020-08-17 00:32:00+02:00 | 1.070022 | 1.302133 | 0 | 0.123432 | 0.3 | False |
| 4233 | 2020-08-17 00:33:00+02:00 | 1.046498 | 1.249119 | 0 | 0.125366 | 0.3 | False |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 14276 | 2020-08-23 23:56:00+02:00 | 0.106058 | 1.186481 | 0 | 0.166797 | 0.3 | False |
| 14277 | 2020-08-23 23:57:00+02:00 | 0.795930 | 1.293424 | 0 | 0.163999 | 0.3 | False |
| 14278 | 2020-08-23 23:58:00+02:00 | 0.695386 | 1.230480 | 0 | 0.167638 | 0.3 | False |
| 14279 | 2020-08-23 23:59:00+02:00 | -0.039992 | 1.115898 | 0 | 0.191947 | 0.3 | False |
| 14280 | 2020-08-24 00:00:00+02:00 | 0.334182 | 9.460269 | 0 | 0.217745 | 0.3 | False |
3996 rows × 7 columns
# Score the detector against the ground-truth labels on the test windows.
# Compute the precision in the test dataset
precision = precision_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
print(f"Precision: {round(precision, 3)}")
# Compute the recall in the test dataset
recall = recall_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
print(f"Recall: {round(recall, 3)}")
# Compute the F1 score in the test dataset.
# Bind the result to a new name: the original `f1_score = f1_score(...)`
# clobbered the imported sklearn function, breaking any later call to it.
f1 = f1_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
print(f"F1 Score: {round(f1, 3)}")
Precision: 0.614 Recall: 0.704 F1 Score: 0.656